package com.cmge.ad.spider.app.qihu;

import java.util.ArrayList;
import java.util.List;

import org.springframework.util.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import com.cmge.ad.model.App;
import com.cmge.ad.util.JsonUtil;
import com.cmge.ad.util.SuprUtil;

/**
 * Crawler for the 360 app store (zhushou.360.cn).
 *
 * <p>Processing happens in three stages, selected per page:
 * <ol>
 *   <li>The very first page handed to {@link #process(Page)} is used purely as a
 *       trigger: its content is ignored and the category list pages are enqueued.</li>
 *   <li>Pages whose URL contains {@code page=} are treated as list pages; the app
 *       detail links found on them are enqueued.</li>
 *   <li>Pages whose URL contains {@code soft_id/} are treated as app detail pages;
 *       an {@link App} is built from them and printed to standard output as JSON.</li>
 * </ol>
 *
 * <p>The one-time trigger is claimed under the instance lock, so exactly one page
 * is consumed for seeding even when the spider runs several worker threads.
 *
 * @author	ljt
 * @time	2015-2-7 3:49:38 PM
 */
public class QihuJsonProcessor implements PageProcessor {

    /** Site settings: retry times 3, sleep time 100 (see WebMagic {@code Site}). */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /** Pattern for list page URLs. Not referenced inside this class; kept for external users. */
    public static final String URL_LIST = "\\w+page=\\w+";

    /** Pattern for detail page URLs. Not referenced inside this class; kept for external users. */
    public static final String URL_POST = "\\w+soft_id/\\w+";

    /** Prefix of a category list URL; the category id is appended to it. */
    private static final String LIST_URL_PREFIX = "http://zhushou.360.cn/list/index/cid/";

    /** Number of list pages requested per category (pages 1 to 50 inclusive). */
    private static final int PAGES_PER_CATEGORY = 50;

    /** Category ids whose list pages are crawled. Populated once and never modified afterwards. */
    private static final List<String> CATEGORY_IDS = new ArrayList<String>();

    static {
        CATEGORY_IDS.add("101587");
        CATEGORY_IDS.add("19");
        CATEGORY_IDS.add("20");
        CATEGORY_IDS.add("51");
        CATEGORY_IDS.add("52");
        CATEGORY_IDS.add("53");
        CATEGORY_IDS.add("54");
    }

    /** True until the list pages have been enqueued. Guarded by {@code this}. */
    private boolean listFlag = true;

    /**
     * Dispatches a downloaded page to the seeding, list or detail handling described
     * on the class. Pages matching none of the cases are ignored.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (claimSeeding()) {
            // The first page is only a trigger; its own content is never parsed.
            enqueueListPages(page);
        } else if (page.getUrl().regex("page=").match()) {
            enqueueDetailPages(page);
        } else if (page.getUrl().regex("soft_id/").match()) {
            System.out.println(JsonUtil.toJson(extractApp(page)));
        }
    }

    /**
     * Atomically claims the one-time seeding step.
     *
     * @return {@code true} for exactly one caller over the lifetime of this instance
     */
    private synchronized boolean claimSeeding() {
        if (!listFlag) {
            return false;
        }
        listFlag = false;
        return true;
    }

    /** Enqueues list pages 1..{@link #PAGES_PER_CATEGORY} for every category id. */
    private static void enqueueListPages(Page page) {
        for (String categoryId : CATEGORY_IDS) {
            String categoryUrl = LIST_URL_PREFIX + categoryId;
            for (int pageNo = 1; pageNo <= PAGES_PER_CATEGORY; pageNo++) {
                page.addTargetRequest(categoryUrl + "?page=" + pageNo);
            }
        }
    }

    /** Enqueues the detail links found in the icon list of a list page, if any. */
    private static void enqueueDetailPages(Page page) {
        List<String> detailUrls = page.getHtml().xpath("//ul[@class='iconList']/li/a[1]/@href").all();
        if (!SuprUtil.isEmptyCollection(detailUrls)) {
            page.addTargetRequests(detailUrls);
        }
    }

    /**
     * Builds an {@link App} from a detail page. The page URL is used both as the
     * remote URL and as the unique id.
     *
     * <p>Only name, icon, size, version, update time, description and screenshots are
     * extracted; earlier commented-out drafts for company, download URL, package name
     * and permission list have been dropped.
     */
    private static App extractApp(Page page) {
        String pageUrl = page.getUrl().toString();

        String appName = firstMatch(page, "//h2[@id='app-name']/span/text()");
        String iconUrl = firstMatch(page, "//dl[@class='clearfix']/dt/img/@src");
        String appSize = firstMatch(page, "//div[@class='pf']/span[4]/text()");
        String appVersion = firstMatch(page, "//div[@class='base-info']/table/tbody/tr[2]/td[1]/text()");
        String updateTime = firstMatch(page, "//div[@class='base-info']/table/tbody/tr[1]/td[2]/text()");

        // NOTE(review): 'breif' is kept exactly as in the original selector; presumably
        // it mirrors the site's own class name -- verify against the live markup.
        String description = firstMatch(page, "//div[@class='breif']/text()");
        if (StringUtils.isEmpty(description)) {
            // Fall back to the alternative description container.
            description = firstMatch(page, "//div[@class='html-brief']/text()");
        }

        List<String> picList = page.getHtml().xpath("//div[@class='overview']/img/@src").all();
        if (SuprUtil.isEmptyCollection(picList)) {
            // Fall back to images embedded in the alternative description container.
            picList = page.getHtml().xpath("//div[@class='html-brief']//img/@src").all();
        }

        App app = new App();
        app.setRemoteUrl(pageUrl);
        app.setAppName(appName);
        app.setIconUrl(iconUrl);
        app.setAppSize(appSize);
        app.setAppVersion(appVersion);
        app.setUpdateTime(updateTime);
        app.setDescription(description);
        app.setPicList(picList);
        app.setUniqueId(pageUrl);
        return app;
    }

    /** Returns the first result of evaluating {@code xpath} against the page's HTML. */
    private static String firstMatch(Page page, String xpath) {
        return page.getHtml().xpath(xpath).get();
    }

    /**
     * @return the site settings used for this crawl
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl with five threads. The start URL is only a trigger page: the
     * first call to {@link #process(Page)} ignores its content and enqueues the
     * list pages instead.
     */
    public static void main(String[] args) throws Exception {
        Spider qsSpider = Spider.create(new QihuJsonProcessor())
                .addUrl("http://www.baidu.com")
                .thread(5);
        qsSpider.start();
    }
}
